package com.cy.bean.spider;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;

/**
 * Page-processing rules for the news crawler.
 *
 * <p>For each page handed to {@link #process(Page)} this processor queues every
 * link that matches the article URL pattern and stores the extracted
 * {@code title} and {@code content} fields on the page. A page from which
 * either field cannot be extracted is flagged as skipped.
 *
 * @author CY
 */
public class NewsSpider implements PageProcessor {

    /**
     * Regex for article URLs to follow, e.g.
     * {@code http://news.ycw.gov.cn/html/2016-01/02/content_12345678.htm}.
     * The dots of the host name are escaped so they match a literal '.' only,
     * not an arbitrary character.
     */
    private static final String ARTICLE_URL_REGEX =
            "http://news\\.ycw\\.gov\\.cn/html/\\d{4}-\\d{2}/\\d{2}/content_\\d{8}\\.htm";

    /** XPath selecting the text of the article headline. */
    private static final String TITLE_XPATH = "/html/body/div[9]/div/div/h1/text()";

    /** XPath selecting the element that holds the article body. */
    private static final String CONTENT_XPATH = "/html/body/div[9]/div/div/i";

    /** Crawl settings: 3 retries and a sleep time of 100 (presumably milliseconds — confirm). */
    private final Site site = Site.me().setRetryTimes(3).setSleepTime(100);

    /**
     * Queues follow-up article links found on the page and extracts its
     * {@code title} and {@code content} fields.
     *
     * @param page the downloaded page to process
     */
    @Override
    public void process(Page page) {
        // Follow every link on the page that looks like an article URL.
        page.addTargetRequests(page.getHtml().links().regex(ARTICLE_URL_REGEX).all());

        String title = page.getHtml().xpath(TITLE_XPATH).toString();
        String content = page.getHtml().xpath(CONTENT_XPATH).toString();
        page.putField("title", title);
        page.putField("content", content);

        // A page missing either field is not a usable article (e.g. an index
        // page that was only crawled for its links), so flag it as skipped.
        if (title == null || content == null) {
            page.setSkip(true);
        }
    }

    /**
     * Returns the crawl settings used by this processor.
     *
     * @return the site configuration
     */
    @Override
    public Site getSite() {
        return site;
    }

}
